# 分别将训练和测试数据读取出来 (read the training and test data separately)
# Clear the current workspace.
# NOTE(review): rm(list = ls()) in a shared script is discouraged; kept here
# to preserve the original behavior.
rm(list=ls())
# Load libraries (file paths below are still hard-coded relative to the
# working directory)
library(plyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
# Read the data.
# NOTE(review): the variable names are swapped relative to the files —
# `data` holds aug_train.csv (the training set) and `data_train` holds
# aug_test.csv (the test set). The misleading names are kept because the
# entire script refers to them.
data <- read.csv("aug_train.csv", header = TRUE)
data_train <- read.csv("aug_test.csv", header = TRUE)
# Drop employee id and city id (columns 1-2): not useful for the analysis
data <- data[, -(1:2)]
data_train <- data_train[, -(1:2)]
# Rename the remaining columns once, reusing the same vector for both frames
col_names <- c("city_dev", "gender", "expenience", "university", "edu_level",
               "major", "expenience_year", "company_size", "company_type",
               "jobs", "train_hours", "target")
names(data) <- col_names
names(data_train) <- col_names
# Return the mode (most frequent value) of a vector `v`.
# Ties are broken in favour of the value that appears first in `v`
# (which.max returns the first maximum; unique() preserves first-seen order).
getmode <- function(v) {
  distinct_vals <- unique(v)                    # distinct values, first-appearance order
  freq <- tabulate(match(v, distinct_vals))     # occurrence count per distinct value
  distinct_vals[which.max(freq)]
}
# Summary statistics of `target` within each level of column `var`.
#
# var: column name (string) of the grouping variable.
# df:  data frame to summarise; defaults to the global `data` for backward
#      compatibility with the existing descrb("...") calls below.
# Returns a matrix with one row per level of `var`: N (count), MU (mean),
# SD (standard deviation), MIN, MED (median), MAX of `target`.
descrb <- function(var, df = data) {
  Z <- df[, var]
  N <- tapply(df$target, Z, length)
  MU <- tapply(df$target, Z, mean)
  SD <- tapply(df$target, Z, sd)
  MIN <- tapply(df$target, Z, min)
  MED <- tapply(df$target, Z, median)
  MAX <- tapply(df$target, Z, max)
  cbind(N, MU, SD, MIN, MED, MAX)
}
# 对于数值类型变量,使用均值进行填充;对于分类变量,使用众数进行填充。这里我们得到了训练数据员工离职率的均值为0.249,后面进行预测的时候会以此阈值进行计算。
# Imputation values learned from the training set: mean for numeric columns,
# mode (via getmode) for categorical ones. Column names are used instead of
# positional indices; they were assigned in the renaming step above.
city_dev_avg <- mean(data$city_dev)                    # mean city development index
gender_med <- getmode(data$gender)                     # mode of gender
expenience_med <- getmode(data$expenience)             # mode of relevant-experience flag
university_med <- getmode(data$university)             # mode of university enrollment
edu_level_med <- getmode(data$edu_level)               # mode of education level
major_med <- getmode(data$major)                       # mode of major
expenience_year_med <- getmode(data$expenience_year)   # mode of years of experience
# getmode(data$company_size) would return "" (blank is the most frequent
# value), so a synthetic "Other" level is used instead.
company_size_med <- "Other"
company_type_med <- getmode(data$company_type)         # mode of company type
jobs_med <- getmode(data$jobs)                         # mode of job-change count
train_hours_med <- mean(data$train_hours)              # mean training hours
summary(data$target)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 0.249 0.000 1.000
# Mean attrition rate is 0.249; values above this threshold will later be
# classified as likely to leave.
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 0.000 0.000 0.000 0.249 0.000 1.000
# Impute blanks: each categorical column gets the training-set mode computed
# above (company_size gets the synthetic "Other"). The SAME values are applied
# to both the training frame (`data`) and the test frame (`data_train`) so the
# two sets are processed identically. (The original code repeated the same
# statement 18 times, with copy-pasted comments that wrongly said "gender"
# for every column.)
impute_vals <- list(
  gender = gender_med,
  expenience = expenience_med,
  university = university_med,
  edu_level = edu_level_med,
  major = major_med,
  expenience_year = expenience_year_med,
  company_size = company_size_med,
  company_type = company_type_med,
  jobs = jobs_med
)
for (nm in names(impute_vals)) {
  data[which(data[[nm]] == ""), nm] <- impute_vals[[nm]]
  data_train[which(data_train[[nm]] == ""), nm] <- impute_vals[[nm]]
}
# Log-transform training hours (this is a log transform, NOT normalisation as
# the original comment said) to put them on a scale comparable with the city
# development index.
data$train_hours_scale <- log(data$train_hours)
data_train$train_hours_scale <- log(data_train$train_hours)
# Inspect the distinct values of each categorical column
unique(data$gender) # "Male" "Female" "Other"
## [1] "Male" "Female" "Other"
unique(data$expenience) # "Has relevent experience" "No relevent experience"
## [1] "Has relevent experience" "No relevent experience"
unique(data$university) # "no_enrollment" "Full time course" "Part time course"
## [1] "no_enrollment" "Full time course" "Part time course"
unique(data$edu_level) # "Graduate" "Masters" "High School" "Phd" "Primary School"
## [1] "Graduate" "Masters" "High School" "Phd"
## [5] "Primary School"
unique(data$major) # "STEM" "Business Degree" "Arts" "Humanities" "No Major" "Other"
## [1] "STEM" "Other" "No Major" "Business Degree"
## [5] "Arts" "Humanities"
sort(unique(data$expenience_year)) # "<1" ">20" "1" "10" "11" "12" "13" "14" "15" "16" "17" "18" "19" "2" "20" "3" "4" "5" "6" "7" "8" "9"
## [1] "<1" ">20" "1" "10" "11" "12" "13" "14" "15" "16" "17" "18"
## [13] "19" "2" "20" "3" "4" "5" "6" "7" "8" "9"
unique(data$company_size) # "Others" "50-99" "<10" "10000+" "5000-9999" "1000-4999" "10/49" "100-500" "500-999"
## [1] "500-999" "100-500" "50-99" "1000-4999" "10000+" "Other"
## [7] "10/49" "5000-9999" "<10"
unique(data$company_type) # "Pvt Ltd" "Funded Startup" "Early Stage Startup" "Other" "Public Sector" "NGO"
## [1] "Pvt Ltd" "Funded Startup" "Public Sector"
## [4] "NGO" "Early Stage Startup" "Other"
unique(data$jobs) # "1" ">4" "never" "4" "3" "2"
## [1] ">4" "2" "1" "3" "4" "never"
# 对于数据量较小的样本,直接合并到Other里面去。
# Gender: far more male than female respondents
barplot(table(data$gender), xlab="gender", col=rainbow(3, alpha = 0.4))
# Share of each gender in the full sample
gender_data <- data %>%
group_by(gender) %>%
count() %>%
ungroup() %>%
mutate(per=`n`/sum(`n`))
gender_data$label <- scales::percent(gender_data$per)
# Labels are matched to rows positionally (grouped rows sort as
# "Female", "Male", "Other"). FIX: label was misspelled "Fmale".
label_gender = c("Female", "Male", "Other")
ggplot(data=gender_data)+
geom_bar(aes(x="", y=per, fill=label_gender), stat="identity", width = 1)+
coord_polar("y", start=0)+
theme_void()+
geom_text(aes(x=1, y = cumsum(per) - per/2, label=label))
# Share of each gender within target = 0 / 1
ggplot(data, aes(factor(target), fill = factor(gender))) + geom_bar(position = "fill")
# Female is a small group: merge it into Other (train and test alike)
data[which(data$gender=="Female"),"gender"] = "Other"
data_train[which(data_train$gender=="Female"),"gender"] = "Other"
# Re-inspect after merging
barplot(table(data$gender),xlab="gender", col=rainbow(3, alpha = 0.4))
gender_data <- data %>%
group_by(gender) %>%
count() %>%
ungroup() %>%
mutate(per=`n`/sum(`n`))
gender_data$label <- scales::percent(gender_data$per)
label_gender = c("Male", "Other")
ggplot(data=gender_data)+
geom_bar(aes(x="", y=per, fill=label_gender), stat="identity", width = 1)+
coord_polar("y", start=0)+
theme_void()+
geom_text(aes(x=1, y = cumsum(per) - per/2, label=label))
ggplot(data, aes(factor(target), fill = factor(gender))) + geom_bar(position = "fill")
# Relevant experience: most respondents have it
barplot(table(data$expenience),xlab="expenience", col=rainbow(2, alpha = 0.4))
# Share of each experience status in the full sample
expenience_data <- data %>%
group_by(expenience) %>%
count() %>%
ungroup() %>%
mutate(per=`n`/sum(`n`))
expenience_data$label <- scales::percent(expenience_data$per)
label_expenience = c("Has relevent experience", "No relevent experience")
ggplot(data=expenience_data)+
geom_bar(aes(x="", y=per, fill=label_expenience), stat="identity", width = 1)+
coord_polar("y", start=0)+
theme_void()+
geom_text(aes(x=1, y = cumsum(per) - per/2, label=label))
# Share of each experience status within target = 0 / 1
ggplot(data, aes(factor(target), fill = factor(expenience))) + geom_bar(position = "fill")
# University enrollment: most respondents are not enrolled
barplot(table(data$university),xlab="university", col=rainbow(3, alpha = 0.4))
# Share of each enrollment status in the full sample
university_data <- data %>%
group_by(university) %>%
count() %>%
ungroup() %>%
mutate(per=`n`/sum(`n`))
university_data$label <- scales::percent(university_data$per)
# NOTE(review): labels are matched to rows positionally; this assumes the
# grouped rows sort as "Full time course", "no_enrollment", "Part time
# course" — locale-dependent, verify.
label_university = c("Full time course", "No enrollment", "Part time course")
ggplot(data=university_data)+
geom_bar(aes(x="", y=per, fill=label_university), stat="identity", width = 1)+
coord_polar("y", start=0)+
theme_void()+
geom_text(aes(x=1, y = cumsum(per) - per/2, label=label))
# Share of each enrollment status within target = 0 / 1
ggplot(data, aes(factor(target), fill = factor(university))) + geom_bar(position = "fill")
# Merge "Full time course" and "Part time course" into one "enrollment" level
data[which(data$university=="Full time course" | data$university=="Part time course"),"university"] = "enrollment"
# BUG FIX: the test set was previously assigned university_med (the mode,
# i.e. "no_enrollment") here, which diverged from the training-set recoding
# on the line above; apply the identical "enrollment" merge instead.
data_train[which(data_train$university=="Full time course" | data_train$university=="Part time course"),"university"] = "enrollment"
# University enrollment after merging: most respondents are not enrolled
barplot(table(data$university),xlab="university", col=rainbow(3, alpha = 0.4))
# Share of each enrollment status in the full sample
university_data <- data %>%
group_by(university) %>%
count() %>%
ungroup() %>%
mutate(per=`n`/sum(`n`))
university_data$label <- scales::percent(university_data$per)
label_university = c("Enrollment", "No enrollment")
ggplot(data=university_data)+
geom_bar(aes(x="", y=per, fill=label_university), stat="identity", width = 1)+
coord_polar("y", start=0)+
theme_void()+
geom_text(aes(x=1, y = cumsum(per) - per/2, label=label))
# Share of each enrollment status within target = 0 / 1
ggplot(data, aes(factor(target), fill = factor(university))) + geom_bar(position = "fill")
# Education level distribution
barplot(table(data$edu_level),xlab="edu_level", col=rainbow(5, alpha = 0.4))
# Share of each education level in the full sample
edu_level_data <- data %>%
group_by(edu_level) %>%
count() %>%
ungroup() %>%
mutate(per=`n`/sum(`n`))
edu_level_data$label <- scales::percent(edu_level_data$per)
# FIX: labels were misspelled ("Gradule", "Primay School")
label_edu_level = c("Graduate", "High School", "Masters", "Phd", "Primary School")
ggplot(data=edu_level_data)+
geom_bar(aes(x="", y=per, fill=label_edu_level), stat="identity", width = 1)+
coord_polar("y", start=0)+
theme_void()+
geom_text(aes(x=1, y = cumsum(per) - per/2, label=label))
# Share of each education level within target = 0 / 1
ggplot(data, aes(factor(target), fill = factor(edu_level))) + geom_bar(position = "fill")
# Major: mostly STEM
barplot(table(data$major),xlab="major", col=rainbow(6, alpha = 0.4))
# Share of each major in the full sample
major_data <- data %>%
group_by(major) %>%
count() %>%
ungroup() %>%
mutate(per=`n`/sum(`n`))
major_data$label <- scales::percent(major_data$per)
label_major = c("Arts", "Business Degree", "Humanities", "No Major", "Other", "STEM" )
ggplot(data=major_data)+
geom_bar(aes(x="", y=per, fill=label_major), stat="identity", width = 1)+
coord_polar("y", start=0)+
theme_void()+
geom_text(aes(x=1, y = cumsum(per) - per/2, label=label))
# Share of each major within target = 0 / 1
ggplot(data, aes(factor(target), fill = factor(major))) + geom_bar(position = "fill")
# Merge all non-STEM majors into Other (train and test alike)
data[which(data$major=="Business Degree" | data$major=="Arts" | data$major=="Humanities" | data$major=="No Major"),"major"] = "Other"
data_train[which(data_train$major=="Business Degree" | data_train$major=="Arts" | data_train$major=="Humanities" | data_train$major=="No Major"),"major"] = "Other"
# Re-inspect after merging
barplot(table(data$major),xlab="major", col=rainbow(6, alpha = 0.4))
# Share of each major in the full sample
major_data <- data %>%
group_by(major) %>%
count() %>%
ungroup() %>%
mutate(per=`n`/sum(`n`))
major_data$label <- scales::percent(major_data$per)
label_major = c("Other", "STEM" )
ggplot(data=major_data)+
geom_bar(aes(x="", y=per, fill=label_major), stat="identity", width = 1)+
coord_polar("y", start=0)+
theme_void()+
geom_text(aes(x=1, y = cumsum(per) - per/2, label=label))
# Share of each major within target = 0 / 1
ggplot(data, aes(factor(target), fill = factor(major))) + geom_bar(position = "fill")
# Years of experience: distribution across the 22 raw levels
# (the original comment here, "mostly STEM", was a copy-paste error)
barplot(table(data$expenience_year),xlab="expenience year", col=rainbow(21, alpha = 0.4))
# Discretize years of experience into numeric form
data[which(data$expenience_year=="<1"),"expenience_year"] = 0 # "<1" -> 0 (treated as no experience)
data[which(data$expenience_year==">20"),"expenience_year"] = 21 # ">20" -> dummy value 21
# Convert to numeric and inspect the sorted distribution
data$expenience_year=as.numeric(data$expenience_year)
barplot(table(sort(data$expenience_year)),xlab="expenience year", col=rainbow(21, alpha = 0.4))
ggplot(data, aes(factor(target), fill = factor(expenience_year))) + geom_bar(position = "fill")
# Bin into THREE groups: 1 = none (<1), 2 = 1-4 years, 3 = more than 4 years
# (the original comment claimed 4 bins 1-5/5-10/10-20/20+; the code makes 3)
data$expenience_year_new = 1 * (data$expenience_year<1) + 2 * (data$expenience_year>=1 & data$expenience_year<=4) + 3 * (data$expenience_year >4)
barplot(table(sort(data$expenience_year_new)),xlab="expenience year", col=rainbow(3, alpha = 0.4))
expenience_year_new_data <- data %>%
group_by(expenience_year_new) %>%
count() %>%
ungroup() %>%
mutate(per=`n`/sum(`n`))
expenience_year_new_data$label <- scales::percent(expenience_year_new_data$per)
# BUG FIX: grouped rows are ordered by bin code 1 (none), 2 (1-4), 3 (4+);
# the labels were previously c("1-4", "4+", "Never"), i.e. shifted by one
# position relative to the rows they describe.
label_expenience_year_new = c("Never", "1-4", "4+")
ggplot(data=expenience_year_new_data)+
geom_bar(aes(x="", y=per, fill=label_expenience_year_new), stat="identity", width = 1)+
coord_polar("y", start=0)+
theme_void()+
geom_text(aes(x=1, y = cumsum(per) - per/2, label=label))
# Share of each experience bin within target = 0 / 1
ggplot(data, aes(factor(target), fill = factor(expenience_year_new))) + geom_bar(position = "fill")
# Apply the identical recoding and binning to the test data
data_train[which(data_train$expenience_year=="<1"),"expenience_year"] = 0
data_train[which(data_train$expenience_year==">20"),"expenience_year"] = 21
data_train$expenience_year=as.numeric(data_train$expenience_year)
data_train$expenience_year_new = 1 * (data_train$expenience_year<1) + 2 * (data_train$expenience_year>=1 & data_train$expenience_year<=4) + 3 * (data_train$expenience_year >4)
# Company size distribution
barplot(table(data$company_size),xlab="company size", col=rainbow(9, alpha = 0.4))
company_size_data <- data %>%
group_by(company_size) %>%
count() %>%
ungroup() %>%
mutate(per=`n`/sum(`n`))
company_size_data$label <- scales::percent(company_size_data$per)
# NOTE(review): labels are matched to rows positionally; assumes the grouped
# rows sort in exactly this order — locale-dependent, verify.
label_company_size = c("<10", "10/49", "100-500", "1000-4999", "10000+", "50-99", "500-999", "5000-9999", "Other")
ggplot(data=company_size_data)+
geom_bar(aes(x="", y=per, fill=label_company_size), stat="identity", width = 1)+
coord_polar("y", start=0)+
theme_void()+
geom_text(aes(x=1, y = cumsum(per) - per/2, label=label))
# Share of each company size within target = 0 / 1
ggplot(data, aes(factor(target), fill = factor(company_size))) + geom_bar(position = "fill")
# Company type distribution
barplot(table(data$company_type),xlab="company type", col=rainbow(6, alpha = 0.4))
company_type_data <- data %>%
group_by(company_type) %>%
count() %>%
ungroup() %>%
mutate(per=`n`/sum(`n`))
company_type_data$label <- scales::percent(company_type_data$per)
# FIX: this label vector was previously (mis)named label_company_size,
# shadowing the company-size labels defined above; renamed for consistency
# with label_company_type used after the merge below.
label_company_type = c("Early Stage Startup", "Funded Startup", "NGO", "Other", "Public Sector", "Pvt Ltd")
ggplot(data=company_type_data)+
geom_bar(aes(x="", y=per, fill=label_company_type), stat="identity", width = 1)+
coord_polar("y", start=0)+
theme_void()+
geom_text(aes(x=1, y = cumsum(per) - per/2, label=label))
# Share of each company type within target = 0 / 1
ggplot(data, aes(factor(target), fill = factor(company_type))) + geom_bar(position = "fill")
# Merge the minority company types into Other (train and test alike)
data[which(data$company_type=="Funded Startup" | data$company_type=="Early Stage Startup" | data$company_type=="Public Sector" | data$company_type=="NGO"),"company_type"] = "Other"
data_train[which(data_train$company_type=="Funded Startup" | data_train$company_type=="Early Stage Startup" | data_train$company_type=="Public Sector" | data_train$company_type=="NGO"),"company_type"] = "Other"
# Re-inspect after merging
barplot(table(data$company_type),xlab="company type", col=rainbow(6, alpha = 0.4))
company_type_data <- data %>%
group_by(company_type) %>%
count() %>%
ungroup() %>%
mutate(per=`n`/sum(`n`))
company_type_data$label <- scales::percent(company_type_data$per)
label_company_type = c("Other", "Pvt Ltd")
ggplot(data=company_type_data)+
geom_bar(aes(x="", y=per, fill=label_company_type), stat="identity", width = 1)+
coord_polar("y", start=0)+
theme_void()+
geom_text(aes(x=1, y = cumsum(per) - per/2, label=label))
ggplot(data, aes(factor(target), fill = factor(company_type))) + geom_bar(position = "fill")
# Number of previous job changes
barplot(table(data$jobs),xlab="jobs", col=rainbow(6, alpha = 0.4))
jobs_data <- data %>%
group_by(jobs) %>%
count() %>%
ungroup() %>%
mutate(per=`n`/sum(`n`))
jobs_data$label <- scales::percent(jobs_data$per)
# NOTE(review): labels are positional; assumes rows sort as ">4","1".."4","never".
label_jobs = c(">4", "1", "2", "3", "4", "Never")
ggplot(data=jobs_data)+
geom_bar(aes(x="", y=per, fill=label_jobs), stat="identity", width = 1)+
coord_polar("y", start=0)+
theme_void()+
geom_text(aes(x=1, y = cumsum(per) - per/2, label=label))
# Share of each job-change count within target = 0 / 1
ggplot(data, aes(factor(target), fill = factor(jobs))) + geom_bar(position = "fill")
# Training hours (numeric): raw distribution
barplot(table(data$train_hours),xlab="train hours", col=rainbow(100, alpha = 0.4))
# Before taking the log there are many outliers
boxplot(train_hours~target, data, ylab="train hours",xlab="target", col=rainbow(2, alpha = 0.4), outline=TRUE)
# After taking the log the distribution is much better behaved
boxplot(log(train_hours)~target, data, ylab="train hours",xlab="target", col=rainbow(2, alpha = 0.4), outline=TRUE)
# Persist the cleaned training data
write.csv(data, "aug_train_after.csv", row.names = FALSE)
# Per-level target statistics for gender
descrb("gender")
## N MU SD MIN MED MAX
## Male 15890 0.2481435 0.4319491 0 0 1
## Other 1268 0.2594637 0.4385133 0 0 1
# Per-level target statistics for relevant experience
descrb("expenience")
## N MU SD MIN MED MAX
## Has relevent experience 12354 0.2133722 0.4097049 0 0 1
## No relevent experience 4804 0.3405495 0.4739434 0 0 1
# Per-level target statistics for university enrollment
descrb("university")
## N MU SD MIN MED MAX
## enrollment 4457 0.3479919 0.4763869 0 0 1
## no_enrollment 12701 0.2142351 0.4103068 0 0 1
# Per-level target statistics for education level
descrb("edu_level")
## N MU SD MIN MED MAX
## Graduate 10776 0.2769117 0.4474932 0 0 1
## High School 1805 0.1961219 0.3971719 0 0 1
## Masters 3934 0.2155567 0.4112603 0 0 1
## Phd 368 0.1385870 0.3459854 0 0 1
## Primary School 275 0.1272727 0.3338859 0 0 1
# Per-level target statistics for major
descrb("major")
## N MU SD MIN MED MAX
## Other 1663 0.2423331 0.4286237 0 0 1
## STEM 15495 0.2496934 0.4328495 0 0 1
# Per-level target statistics for the experience-year bins
descrb("expenience_year_new")
## N MU SD MIN MED MAX
## 1 478 0.4518828 0.4982008 0 0 1
## 2 3939 0.3445037 0.4752665 0 0 1
## 3 12741 0.2118358 0.4086251 0 0 1
# Per-level target statistics for company size
descrb("company_size")
## N MU SD MIN MED MAX
## <10 1180 0.1703390 0.3760897 0 0 1
## 10/49 1316 0.2355623 0.4245111 0 0 1
## 100-500 2293 0.1604884 0.3671385 0 0 1
## 1000-4999 1201 0.1507077 0.3579128 0 0 1
## 10000+ 1796 0.1932071 0.3949240 0 0 1
## 50-99 2767 0.1752801 0.3802752 0 0 1
## 500-999 789 0.1711027 0.3768375 0 0 1
## 5000-9999 511 0.1819961 0.3862194 0 0 1
## Other 5305 0.4056550 0.4910646 0 0 1
# Per-level target statistics for company type
descrb("company_type")
## N MU SD MIN MED MAX
## Other 2845 0.1926186 0.3944254 0 0 1
## Pvt Ltd 14313 0.2601831 0.4387497 0 0 1
# Per-level target statistics for job-change count
descrb("jobs")
## N MU SD MIN MED MAX
## >4 2961 0.1789936 0.3834117 0 0 1
## 1 7588 0.2688455 0.4433887 0 0 1
## 2 2591 0.2392898 0.4267324 0 0 1
## 3 921 0.2280130 0.4197790 0 0 1
## 4 925 0.2259459 0.4184300 0 0 1
## never 2172 0.3052486 0.4606187 0 0 1
# Per-level target statistics for training hours (one row per distinct hour
# value — very granular; train_hours is numeric, not categorical)
descrb("train_hours")
## N MU SD MIN MED MAX
## 1 9 0.22222222 0.4409586 0 0.0 1
## 2 89 0.23595506 0.4269999 0 0.0 1
## 3 102 0.26470588 0.4433551 0 0.0 1
## 4 194 0.17010309 0.3766955 0 0.0 1
## 5 97 0.23711340 0.4275218 0 0.0 1
## 6 235 0.25957447 0.4393369 0 0.0 1
## 7 185 0.29189189 0.4558666 0 0.0 1
## 8 191 0.23036649 0.4221740 0 0.0 1
## 9 208 0.30769231 0.4626519 0 0.0 1
## 10 228 0.27631579 0.4481588 0 0.0 1
## 11 209 0.20574163 0.4052131 0 0.0 1
## 12 265 0.20754717 0.4063180 0 0.0 1
## 13 194 0.31443299 0.4654909 0 0.0 1
## 14 192 0.24479167 0.4310877 0 0.0 1
## 15 204 0.25000000 0.4340779 0 0.0 1
## 16 168 0.26785714 0.4441666 0 0.0 1
## 17 243 0.25925926 0.4391326 0 0.0 1
## 18 264 0.25000000 0.4338351 0 0.0 1
## 19 143 0.27972028 0.4504394 0 0.0 1
## 20 246 0.23170732 0.4227832 0 0.0 1
## 21 230 0.27391304 0.4469376 0 0.0 1
## 22 250 0.22800000 0.4203842 0 0.0 1
## 23 237 0.23628692 0.4256995 0 0.0 1
## 24 238 0.25630252 0.4375109 0 0.0 1
## 25 184 0.27717391 0.4488244 0 0.0 1
## 26 229 0.27510917 0.4475475 0 0.0 1
## 27 102 0.26470588 0.4433551 0 0.0 1
## 28 293 0.22866894 0.4206941 0 0.0 1
## 29 160 0.26250000 0.4413744 0 0.0 1
## 30 168 0.25595238 0.4376998 0 0.0 1
## 31 165 0.20606061 0.4057058 0 0.0 1
## 32 182 0.30769231 0.4628117 0 0.0 1
## 33 135 0.23703704 0.4268490 0 0.0 1
## 34 235 0.26382979 0.4416488 0 0.0 1
## 35 142 0.23239437 0.4238542 0 0.0 1
## 36 189 0.26455026 0.4422650 0 0.0 1
## 37 146 0.23287671 0.4241193 0 0.0 1
## 38 108 0.25000000 0.4350314 0 0.0 1
## 39 160 0.28125000 0.4510209 0 0.0 1
## 40 172 0.28488372 0.4526769 0 0.0 1
## 41 128 0.17968750 0.3854355 0 0.0 1
## 42 210 0.23809524 0.4269354 0 0.0 1
## 43 179 0.28491620 0.4526407 0 0.0 1
## 44 183 0.26775956 0.4440064 0 0.0 1
## 45 156 0.27564103 0.4482758 0 0.0 1
## 46 202 0.25742574 0.4383021 0 0.0 1
## 47 134 0.24626866 0.4324535 0 0.0 1
## 48 215 0.26511628 0.4424252 0 0.0 1
## 49 45 0.24444444 0.4346135 0 0.0 1
## 50 247 0.30364372 0.4607643 0 0.0 1
## 51 165 0.32121212 0.4683635 0 0.0 1
## 52 178 0.21348315 0.4109218 0 0.0 1
## 53 117 0.26495726 0.4432086 0 0.0 1
## 54 147 0.22448980 0.4186724 0 0.0 1
## 55 153 0.20915033 0.4080376 0 0.0 1
## 56 231 0.29004329 0.4547675 0 0.0 1
## 57 127 0.26771654 0.4445226 0 0.0 1
## 58 122 0.27049180 0.4460457 0 0.0 1
## 59 62 0.16129032 0.3708010 0 0.0 1
## 60 85 0.21176471 0.4109837 0 0.0 1
## 61 90 0.23333333 0.4253221 0 0.0 1
## 62 114 0.30701754 0.4632932 0 0.0 1
## 63 70 0.28571429 0.4550158 0 0.0 1
## 64 127 0.23622047 0.4264414 0 0.0 1
## 65 71 0.21126761 0.4111132 0 0.0 1
## 66 103 0.20388350 0.4048535 0 0.0 1
## 67 86 0.23255814 0.4249406 0 0.0 1
## 68 104 0.33653846 0.4748137 0 0.0 1
## 69 73 0.21917808 0.4165525 0 0.0 1
## 70 121 0.17355372 0.3803000 0 0.0 1
## 71 15 0.13333333 0.3518658 0 0.0 1
## 72 132 0.20454545 0.4049057 0 0.0 1
## 73 56 0.23214286 0.4260205 0 0.0 1
## 74 111 0.23423423 0.4254400 0 0.0 1
## 75 51 0.29411765 0.4601790 0 0.0 1
## 76 72 0.29166667 0.4577194 0 0.0 1
## 77 79 0.24050633 0.4301219 0 0.0 1
## 78 150 0.36000000 0.4816080 0 0.0 1
## 79 58 0.20689655 0.4086186 0 0.0 1
## 80 130 0.24615385 0.4324357 0 0.0 1
## 81 55 0.23636364 0.4287638 0 0.0 1
## 82 87 0.19540230 0.3988087 0 0.0 1
## 83 75 0.18666667 0.3922676 0 0.0 1
## 84 100 0.22000000 0.4163332 0 0.0 1
## 85 54 0.16666667 0.3761774 0 0.0 1
## 86 86 0.20930233 0.4091966 0 0.0 1
## 87 49 0.36734694 0.4870779 0 0.0 1
## 88 77 0.28571429 0.4547163 0 0.0 1
## 89 61 0.26229508 0.4435328 0 0.0 1
## 90 102 0.19607843 0.3989892 0 0.0 1
## 91 63 0.28571429 0.4553826 0 0.0 1
## 92 91 0.20879121 0.4086967 0 0.0 1
## 94 107 0.22429907 0.4190828 0 0.0 1
## 95 34 0.32352941 0.4748581 0 0.0 1
## 96 112 0.27678571 0.4494205 0 0.0 1
## 97 38 0.15789474 0.3695370 0 0.0 1
## 98 68 0.19117647 0.3961514 0 0.0 1
## 99 42 0.21428571 0.4152997 0 0.0 1
## 100 100 0.22000000 0.4163332 0 0.0 1
## 101 34 0.29411765 0.4624973 0 0.0 1
## 102 126 0.32539683 0.4703933 0 0.0 1
## 103 29 0.27586207 0.4548588 0 0.0 1
## 104 59 0.27118644 0.4483882 0 0.0 1
## 105 59 0.11864407 0.3261450 0 0.0 1
## 106 87 0.19540230 0.3988087 0 0.0 1
## 107 53 0.33962264 0.4781131 0 0.0 1
## 108 83 0.20481928 0.4060228 0 0.0 1
## 109 53 0.30188679 0.4634696 0 0.0 1
## 110 67 0.23880597 0.4295717 0 0.0 1
## 111 55 0.34545455 0.4798990 0 0.0 1
## 112 88 0.29545455 0.4588614 0 0.0 1
## 113 40 0.20000000 0.4050957 0 0.0 1
## 114 61 0.31147541 0.4669398 0 0.0 1
## 116 56 0.35714286 0.4834938 0 0.0 1
## 117 21 0.33333333 0.4830459 0 0.0 1
## 118 36 0.33333333 0.4780914 0 0.0 1
## 119 18 0.27777778 0.4608886 0 0.0 1
## 120 13 0.15384615 0.3755338 0 0.0 1
## 121 15 0.20000000 0.4140393 0 0.0 1
## 122 45 0.22222222 0.4204375 0 0.0 1
## 123 17 0.11764706 0.3321056 0 0.0 1
## 124 39 0.25641026 0.4423590 0 0.0 1
## 125 21 0.23809524 0.4364358 0 0.0 1
## 126 32 0.12500000 0.3360108 0 0.0 1
## 127 19 0.15789474 0.3746343 0 0.0 1
## 128 35 0.22857143 0.4260430 0 0.0 1
## 129 19 0.21052632 0.4188539 0 0.0 1
## 130 49 0.18367347 0.3912304 0 0.0 1
## 131 22 0.18181818 0.3947710 0 0.0 1
## 132 28 0.25000000 0.4409586 0 0.0 1
## 133 21 0.14285714 0.3585686 0 0.0 1
## 134 52 0.26923077 0.4478876 0 0.0 1
## 135 23 0.17391304 0.3875534 0 0.0 1
## 136 35 0.28571429 0.4583492 0 0.0 1
## 138 38 0.21052632 0.4131550 0 0.0 1
## 139 21 0.33333333 0.4830459 0 0.0 1
## 140 39 0.28205128 0.4558808 0 0.0 1
## 141 21 0.19047619 0.4023739 0 0.0 1
## 142 15 0.26666667 0.4577377 0 0.0 1
## 143 18 0.11111111 0.3233808 0 0.0 1
## 144 41 0.26829268 0.4485750 0 0.0 1
## 145 25 0.32000000 0.4760952 0 0.0 1
## 146 42 0.26190476 0.4450006 0 0.0 1
## 147 14 0.21428571 0.4258153 0 0.0 1
## 148 30 0.23333333 0.4301831 0 0.0 1
## 149 24 0.12500000 0.3378320 0 0.0 1
## 150 38 0.31578947 0.4710691 0 0.0 1
## 151 18 0.33333333 0.4850713 0 0.0 1
## 152 43 0.25581395 0.4414814 0 0.0 1
## 153 14 0.35714286 0.4972452 0 0.0 1
## 154 34 0.14705882 0.3594906 0 0.0 1
## 155 21 0.23809524 0.4364358 0 0.0 1
## 156 47 0.34042553 0.4789752 0 0.0 1
## 157 26 0.26923077 0.4523443 0 0.0 1
## 158 41 0.14634146 0.3578390 0 0.0 1
## 160 48 0.18750000 0.3944428 0 0.0 1
## 161 15 0.13333333 0.3518658 0 0.0 1
## 162 32 0.21875000 0.4200134 0 0.0 1
## 163 25 0.20000000 0.4082483 0 0.0 1
## 164 18 0.22222222 0.4277926 0 0.0 1
## 165 15 0.26666667 0.4577377 0 0.0 1
## 166 59 0.25423729 0.4391693 0 0.0 1
## 167 15 0.20000000 0.4140393 0 0.0 1
## 168 33 0.36363636 0.4885042 0 0.0 1
## 170 24 0.12500000 0.3378320 0 0.0 1
## 172 16 0.18750000 0.4031129 0 0.0 1
## 174 26 0.34615385 0.4851645 0 0.0 1
## 176 17 0.17647059 0.3929526 0 0.0 1
## 178 28 0.25000000 0.4409586 0 0.0 1
## 180 28 0.21428571 0.4178554 0 0.0 1
## 182 34 0.26470588 0.4478111 0 0.0 1
## 184 24 0.29166667 0.4643056 0 0.0 1
## 188 25 0.32000000 0.4760952 0 0.0 1
## 190 15 0.20000000 0.4140393 0 0.0 1
## 192 37 0.35135135 0.4839775 0 0.0 1
## 194 18 0.16666667 0.3834825 0 0.0 1
## 196 26 0.26923077 0.4523443 0 0.0 1
## 198 21 0.23809524 0.4364358 0 0.0 1
## 200 22 0.18181818 0.3947710 0 0.0 1
## 202 18 0.22222222 0.4277926 0 0.0 1
## 204 30 0.16666667 0.3790490 0 0.0 1
## 206 22 0.13636364 0.3512501 0 0.0 1
## 210 26 0.19230769 0.4019185 0 0.0 1
## 212 14 0.21428571 0.4258153 0 0.0 1
## 214 30 0.23333333 0.4301831 0 0.0 1
## 216 15 0.26666667 0.4577377 0 0.0 1
## 218 20 0.20000000 0.4103913 0 0.0 1
## 220 15 0.26666667 0.4577377 0 0.0 1
## 222 29 0.17241379 0.3844259 0 0.0 1
## 224 20 0.15000000 0.3663475 0 0.0 1
## 226 18 0.22222222 0.4277926 0 0.0 1
## 228 6 0.50000000 0.5477226 0 0.5 1
## 232 14 0.14285714 0.3631365 0 0.0 1
## 234 4 0.00000000 0.0000000 0 0.0 0
## 236 7 0.00000000 0.0000000 0 0.0 0
## 238 4 0.00000000 0.0000000 0 0.0 0
## 240 5 0.20000000 0.4472136 0 0.0 1
## 242 11 0.00000000 0.0000000 0 0.0 0
## 244 8 0.25000000 0.4629100 0 0.0 1
## 246 12 0.16666667 0.3892495 0 0.0 1
## 248 11 0.27272727 0.4670994 0 0.0 1
## 250 12 0.16666667 0.3892495 0 0.0 1
## 254 9 0.11111111 0.3333333 0 0.0 1
## 256 13 0.00000000 0.0000000 0 0.0 0
## 258 12 0.41666667 0.5149287 0 0.0 1
## 260 8 0.12500000 0.3535534 0 0.0 1
## 262 10 0.10000000 0.3162278 0 0.0 1
## 264 15 0.13333333 0.3518658 0 0.0 1
## 266 6 0.16666667 0.4082483 0 0.0 1
## 268 11 0.45454545 0.5222330 0 0.0 1
## 270 7 0.42857143 0.5345225 0 0.0 1
## 272 5 0.20000000 0.4472136 0 0.0 1
## 276 5 0.00000000 0.0000000 0 0.0 0
## 278 13 0.00000000 0.0000000 0 0.0 0
## 280 6 0.33333333 0.5163978 0 0.0 1
## 282 8 0.25000000 0.4629100 0 0.0 1
## 284 7 0.28571429 0.4879500 0 0.0 1
## 286 5 0.60000000 0.5477226 0 1.0 1
## 288 11 0.09090909 0.3015113 0 0.0 1
## 290 7 0.57142857 0.5345225 0 1.0 1
## 292 10 0.30000000 0.4830459 0 0.0 1
## 294 6 0.00000000 0.0000000 0 0.0 0
## 298 13 0.46153846 0.5188745 0 0.0 1
## 300 11 0.09090909 0.3015113 0 0.0 1
## 302 8 0.37500000 0.5175492 0 0.0 1
## 304 12 0.16666667 0.3892495 0 0.0 1
## 306 11 0.18181818 0.4045199 0 0.0 1
## 308 14 0.28571429 0.4688072 0 0.0 1
## 310 6 0.00000000 0.0000000 0 0.0 0
## 312 11 0.09090909 0.3015113 0 0.0 1
## 314 12 0.16666667 0.3892495 0 0.0 1
## 316 11 0.27272727 0.4670994 0 0.0 1
## 320 9 0.11111111 0.3333333 0 0.0 1
## 322 12 0.08333333 0.2886751 0 0.0 1
## 324 9 0.22222222 0.4409586 0 0.0 1
## 326 10 0.00000000 0.0000000 0 0.0 0
## 328 10 0.20000000 0.4216370 0 0.0 1
## 330 10 0.10000000 0.3162278 0 0.0 1
## 332 10 0.40000000 0.5163978 0 0.0 1
## 334 11 0.18181818 0.4045199 0 0.0 1
## 336 11 0.27272727 0.4670994 0 0.0 1
# Build logistic-regression models
# Null (intercept-only) model: baseline for the deviance comparison below
model.empty=glm(target~1,family=binomial(link=logit), data=data)
summary(model.empty)
##
## Call:
## glm(formula = target ~ 1, family = binomial(link = logit), data = data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.7567 -0.7567 -0.7567 -0.7567 1.6676
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.10406 0.01765 -62.54 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 19259 on 17157 degrees of freedom
## Residual deviance: 19259 on 17157 degrees of freedom
## AIC: 19261
##
## Number of Fisher Scoring iterations: 4
# Full model: all candidate predictors, categoricals entered via as.factor()
model.full=glm(target~city_dev + as.factor(gender) + as.factor(expenience) + as.factor(university) + as.factor(edu_level) + as.factor(major) + as.factor(expenience_year_new) + as.factor(company_size) + as.factor(company_type) + as.factor(jobs) + train_hours,family=binomial(link=logit),data=data)
summary(model.full)
##
## Call:
## glm(formula = target ~ city_dev + as.factor(gender) + as.factor(expenience) +
## as.factor(university) + as.factor(edu_level) + as.factor(major) +
## as.factor(expenience_year_new) + as.factor(company_size) +
## as.factor(company_type) + as.factor(jobs) + train_hours,
## family = binomial(link = logit), data = data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1736 -0.6840 -0.4640 -0.2109 2.7797
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) 3.8608333 0.2068226 18.667
## city_dev -6.0094390 0.1588004 -37.843
## as.factor(gender)Other 0.1760703 0.0745309 2.362
## as.factor(expenience)No relevent experience 0.2598113 0.0522554 4.972
## as.factor(university)no_enrollment -0.2245314 0.0468793 -4.790
## as.factor(edu_level)High School -0.9418252 0.0750823 -12.544
## as.factor(edu_level)Masters -0.1794383 0.0492573 -3.643
## as.factor(edu_level)Phd -0.5081710 0.1672069 -3.039
## as.factor(edu_level)Primary School -1.4030985 0.1981243 -7.082
## as.factor(major)STEM -0.0778109 0.0686363 -1.134
## as.factor(expenience_year_new)2 -0.1739321 0.1103607 -1.576
## as.factor(expenience_year_new)3 -0.3552849 0.1099728 -3.231
## as.factor(company_size)10/49 0.3175236 0.1088392 2.917
## as.factor(company_size)100-500 -0.0328439 0.1031412 -0.318
## as.factor(company_size)1000-4999 0.0761153 0.1197424 0.636
## as.factor(company_size)10000+ 0.3110905 0.1065759 2.919
## as.factor(company_size)50-99 0.0387523 0.0991545 0.391
## as.factor(company_size)500-999 0.0690498 0.1308882 0.528
## as.factor(company_size)5000-9999 0.2637057 0.1477820 1.784
## as.factor(company_size)Other 1.4180197 0.0938440 15.110
## as.factor(company_type)Pvt Ltd -0.0637367 0.0597014 -1.068
## as.factor(jobs)1 0.0575809 0.0624454 0.922
## as.factor(jobs)2 0.1387070 0.0735944 1.885
## as.factor(jobs)3 0.1566176 0.1005800 1.557
## as.factor(jobs)4 0.2277876 0.1008498 2.259
## as.factor(jobs)never -0.4914202 0.0845609 -5.811
## train_hours -0.0008962 0.0003351 -2.675
## Pr(>|z|)
## (Intercept) < 2e-16 ***
## city_dev < 2e-16 ***
## as.factor(gender)Other 0.01816 *
## as.factor(expenience)No relevent experience 6.63e-07 ***
## as.factor(university)no_enrollment 1.67e-06 ***
## as.factor(edu_level)High School < 2e-16 ***
## as.factor(edu_level)Masters 0.00027 ***
## as.factor(edu_level)Phd 0.00237 **
## as.factor(edu_level)Primary School 1.42e-12 ***
## as.factor(major)STEM 0.25693
## as.factor(expenience_year_new)2 0.11502
## as.factor(expenience_year_new)3 0.00124 **
## as.factor(company_size)10/49 0.00353 **
## as.factor(company_size)100-500 0.75015
## as.factor(company_size)1000-4999 0.52500
## as.factor(company_size)10000+ 0.00351 **
## as.factor(company_size)50-99 0.69593
## as.factor(company_size)500-999 0.59781
## as.factor(company_size)5000-9999 0.07435 .
## as.factor(company_size)Other < 2e-16 ***
## as.factor(company_type)Pvt Ltd 0.28570
## as.factor(jobs)1 0.35648
## as.factor(jobs)2 0.05946 .
## as.factor(jobs)3 0.11944
## as.factor(jobs)4 0.02390 *
## as.factor(jobs)never 6.19e-09 ***
## train_hours 0.00748 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 19259 on 17157 degrees of freedom
## Residual deviance: 16043 on 17131 degrees of freedom
## AIC: 16097
##
## Number of Fisher Scoring iterations: 4
# Analysis of deviance: null (intercept-only) model vs. full model
anova(model.empty, model.full)
## Analysis of Deviance Table
##
## Model 1: target ~ 1
## Model 2: target ~ city_dev + as.factor(gender) + as.factor(expenience) +
## as.factor(university) + as.factor(edu_level) + as.factor(major) +
## as.factor(expenience_year_new) + as.factor(company_size) +
## as.factor(company_type) + as.factor(jobs) + train_hours
## Resid. Df Resid. Dev Df Deviance
## 1 17157 19258
## 2 17131 16043 26 3215.7
# Likelihood-ratio statistic = null deviance - residual deviance
# (fixed: the original mistyped 16043 as 16003, giving 3255 instead of 3215)
19258-16043
## [1] 3215
# Chi-square test on the deviance drop (3215.7 from the table above, df = 26);
# p-value is ~0, so at least one predictor is significant
# (fixed: the original used 3255.7, inconsistent with the anova output)
1-pchisq(3215.7, df=26)
## [1] 0
library(car) # load the car package (provides Anova for Type II/III tests)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
# Type III SS (shown as "Adjust SS" in some software) is each variable's extra
# (independent) contribution after all p variables are in the model.
# In general the per-variable SS sum to less than SSR; they equal SSR only when
# the variables are completely uncorrelated. A Type III r^2 follows accordingly.
Anova(model.full, type="III") # Type III analysis of deviance on the full model
## Analysis of Deviance Table (Type III tests)
##
## Response: target
## LR Chisq Df Pr(>Chisq)
## city_dev 1517.56 1 < 2.2e-16 ***
## as.factor(gender) 5.48 1 0.019182 *
## as.factor(expenience) 24.51 1 7.401e-07 ***
## as.factor(university) 22.75 1 1.843e-06 ***
## as.factor(edu_level) 219.92 4 < 2.2e-16 ***
## as.factor(major) 1.28 1 0.258672
## as.factor(expenience_year_new) 19.20 2 6.786e-05 ***
## as.factor(company_size) 788.08 8 < 2.2e-16 ***
## as.factor(company_type) 1.13 1 0.286919
## as.factor(jobs) 85.55 5 < 2.2e-16 ***
## train_hours 7.25 1 0.007093 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Model selection
# AIC-based stepwise selection (step() defaults to AIC penalty k = 2)
model.aic=step(model.full,trace=F)
summary(model.aic)
##
## Call:
## glm(formula = target ~ city_dev + as.factor(gender) + as.factor(expenience) +
## as.factor(university) + as.factor(edu_level) + as.factor(expenience_year_new) +
## as.factor(company_size) + as.factor(jobs) + train_hours,
## family = binomial(link = logit), data = data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1762 -0.6848 -0.4635 -0.2121 2.7759
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) 3.7314437 0.1885195 19.793
## city_dev -5.9798731 0.1574074 -37.990
## as.factor(gender)Other 0.1844614 0.0743442 2.481
## as.factor(expenience)No relevent experience 0.2668466 0.0520867 5.123
## as.factor(university)no_enrollment -0.2211558 0.0466287 -4.743
## as.factor(edu_level)High School -0.9516658 0.0746569 -12.747
## as.factor(edu_level)Masters -0.1772305 0.0492190 -3.601
## as.factor(edu_level)Phd -0.4956321 0.1666211 -2.975
## as.factor(edu_level)Primary School -1.4147567 0.1978204 -7.152
## as.factor(expenience_year_new)2 -0.1769660 0.1103637 -1.603
## as.factor(expenience_year_new)3 -0.3618494 0.1098322 -3.295
## as.factor(company_size)10/49 0.3104811 0.1086561 2.857
## as.factor(company_size)100-500 -0.0466643 0.1024966 -0.455
## as.factor(company_size)1000-4999 0.0581195 0.1187263 0.490
## as.factor(company_size)10000+ 0.2869538 0.1047285 2.740
## as.factor(company_size)50-99 0.0244237 0.0984360 0.248
## as.factor(company_size)500-999 0.0538645 0.1302738 0.413
## as.factor(company_size)5000-9999 0.2446250 0.1470373 1.664
## as.factor(company_size)Other 1.3925382 0.0905437 15.380
## as.factor(jobs)1 0.0586842 0.0622992 0.942
## as.factor(jobs)2 0.1405328 0.0735056 1.912
## as.factor(jobs)3 0.1573387 0.1005466 1.565
## as.factor(jobs)4 0.2280631 0.1007859 2.263
## as.factor(jobs)never -0.4960657 0.0844113 -5.877
## train_hours -0.0008947 0.0003350 -2.671
## Pr(>|z|)
## (Intercept) < 2e-16 ***
## city_dev < 2e-16 ***
## as.factor(gender)Other 0.013095 *
## as.factor(expenience)No relevent experience 3.01e-07 ***
## as.factor(university)no_enrollment 2.11e-06 ***
## as.factor(edu_level)High School < 2e-16 ***
## as.factor(edu_level)Masters 0.000317 ***
## as.factor(edu_level)Phd 0.002934 **
## as.factor(edu_level)Primary School 8.57e-13 ***
## as.factor(expenience_year_new)2 0.108829
## as.factor(expenience_year_new)3 0.000986 ***
## as.factor(company_size)10/49 0.004270 **
## as.factor(company_size)100-500 0.648910
## as.factor(company_size)1000-4999 0.624470
## as.factor(company_size)10000+ 0.006144 **
## as.factor(company_size)50-99 0.804043
## as.factor(company_size)500-999 0.679261
## as.factor(company_size)5000-9999 0.096174 .
## as.factor(company_size)Other < 2e-16 ***
## as.factor(jobs)1 0.346206
## as.factor(jobs)2 0.055894 .
## as.factor(jobs)3 0.117622
## as.factor(jobs)4 0.023645 *
## as.factor(jobs)never 4.18e-09 ***
## train_hours 0.007574 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 19259 on 17157 degrees of freedom
## Residual deviance: 16045 on 17133 degrees of freedom
## AIC: 16095
##
## Number of Fisher Scoring iterations: 4
# BIC-based stepwise selection: same step() search with penalty k = log(n)
ss=nrow(data) # sample size (was length(data[,1]); nrow() is the idiomatic form)
model.bic=step(model.full,trace=FALSE,k=log(ss))
summary(model.bic)
##
## Call:
## glm(formula = target ~ city_dev + as.factor(expenience) + as.factor(university) +
## as.factor(edu_level) + as.factor(expenience_year_new) + as.factor(company_size) +
## as.factor(jobs), family = binomial(link = logit), data = data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2130 -0.6827 -0.4640 -0.2134 2.7889
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) 3.67369 0.18718 19.626
## city_dev -5.95251 0.15700 -37.915
## as.factor(expenience)No relevent experience 0.27010 0.05205 5.189
## as.factor(university)no_enrollment -0.21964 0.04660 -4.713
## as.factor(edu_level)High School -0.95901 0.07461 -12.853
## as.factor(edu_level)Masters -0.17200 0.04916 -3.499
## as.factor(edu_level)Phd -0.48637 0.16625 -2.925
## as.factor(edu_level)Primary School -1.42518 0.19793 -7.200
## as.factor(expenience_year_new)2 -0.18549 0.11036 -1.681
## as.factor(expenience_year_new)3 -0.37586 0.10977 -3.424
## as.factor(company_size)10/49 0.31309 0.10860 2.883
## as.factor(company_size)100-500 -0.04496 0.10250 -0.439
## as.factor(company_size)1000-4999 0.06477 0.11867 0.546
## as.factor(company_size)10000+ 0.29080 0.10471 2.777
## as.factor(company_size)50-99 0.02545 0.09842 0.259
## as.factor(company_size)500-999 0.05694 0.13024 0.437
## as.factor(company_size)5000-9999 0.24754 0.14705 1.683
## as.factor(company_size)Other 1.39312 0.09052 15.390
## as.factor(jobs)1 0.05821 0.06225 0.935
## as.factor(jobs)2 0.13866 0.07346 1.888
## as.factor(jobs)3 0.15650 0.10047 1.558
## as.factor(jobs)4 0.22723 0.10071 2.256
## as.factor(jobs)never -0.49484 0.08435 -5.866
## Pr(>|z|)
## (Intercept) < 2e-16 ***
## city_dev < 2e-16 ***
## as.factor(expenience)No relevent experience 2.11e-07 ***
## as.factor(university)no_enrollment 2.44e-06 ***
## as.factor(edu_level)High School < 2e-16 ***
## as.factor(edu_level)Masters 0.000468 ***
## as.factor(edu_level)Phd 0.003439 **
## as.factor(edu_level)Primary School 6.00e-13 ***
## as.factor(expenience_year_new)2 0.092793 .
## as.factor(expenience_year_new)3 0.000617 ***
## as.factor(company_size)10/49 0.003941 **
## as.factor(company_size)100-500 0.660949
## as.factor(company_size)1000-4999 0.585246
## as.factor(company_size)10000+ 0.005481 **
## as.factor(company_size)50-99 0.795930
## as.factor(company_size)500-999 0.661951
## as.factor(company_size)5000-9999 0.092294 .
## as.factor(company_size)Other < 2e-16 ***
## as.factor(jobs)1 0.349741
## as.factor(jobs)2 0.059081 .
## as.factor(jobs)3 0.119298
## as.factor(jobs)4 0.024048 *
## as.factor(jobs)never 4.46e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 19259 on 17157 degrees of freedom
## Residual deviance: 16059 on 17135 degrees of freedom
## AIC: 16105
##
## Number of Fisher Scoring iterations: 4
# Load pROC for ROC curves and AUC
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
# In-sample predictions (linear predictor scale) for all three models.
# Fixed: predict() takes `newdata=`, not `data=`. The original `data=data` was
# silently absorbed by `...`, so predict() fell back to the fitted training
# values -- numerically the same here, but only by accident.
pred.full=predict(model.full,newdata=data)
pred.aic=predict(model.aic,newdata=data)
pred.bic=predict(model.bic,newdata=data)
roc.full=roc(data$target,pred.full)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc.aic=roc(data$target,pred.aic)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc.bic=roc(data$target,pred.bic)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Compare the AUCs; the full model is currently best: 0.7812451 0.7811417 0.7805449
print(c(roc.full$auc,roc.aic$auc,roc.bic$auc))
## [1] 0.7812451 0.7811417 0.7805449
par(mfrow=c(1,3))
plot(roc.full,main="Full model", col=1)
plot(roc.aic,main="AIC", col=2)
plot(roc.bic,main="BIC", col=3)
# Out-of-sample prediction: evaluate the AIC model on the test data first
par(mfrow=c(1,1))
data_train$company_size=as.factor(data_train$company_size) # match the factor coding used in training
p = predict(model.aic, newdata=data_train) # linear predictor (log-odds) on the test set
# Inverse-logit to get probabilities. plogis() is the stable form of
# exp(p)/(1+exp(p)), which overflows to NaN for large linear predictors.
p = plogis(p)
data_train$pred=1*(p>0.249) # classify at the 0.249 threshold (training attrition rate)
table(data_train[,c("target","pred")]) # 2x2 confusion table: truth vs. prediction
## pred
## target 0 1
## 0 1095 400
## 1 134 371
# Hand-rolled ROC curve over a grid of thresholds
ngrids=500 # number of threshold grid points
TPR=rep(0,ngrids) # true positive rate at each threshold
FPR=rep(0,ngrids) # false positive rate at each threshold
ST.true=data_train$target # observed labels (loop-invariant, hoisted out of the loop)
for(i in seq_len(ngrids)){
  p0=i/ngrids # candidate threshold
  ST.pred=1*(p>p0) # predicted labels at threshold p0
  TPR[i]=sum(ST.pred*ST.true)/sum(ST.true) # TPR = TP / P
  FPR[i]=sum(ST.pred*(1-ST.true))/sum(1-ST.true) # FPR = FP / N
}
plot(FPR,TPR,type="l",col=2) # ROC curve (FPR vs. TPR)
points(c(0,1),c(0,1),type="l",lty=2) # diagonal reference (random classifier)
# Full model vs. AIC vs. BIC: overlay their ROC curves on the test set
p=matrix(0,nrow(data_train),3) # one column of predictions per model
p[,1]=predict(model.full,newdata=data_train) # full model
p[,2]=predict(model.aic,newdata=data_train) # AIC-selected model
p[,3]=predict(model.bic,newdata=data_train) # BIC-selected model
p=plogis(p) # inverse-logit: stable form of exp(p)/(1+exp(p))
# Fixed: the original relied on ST.true leaking from the previous chunk's loop;
# define it here so this chunk also works standalone.
ST.true=data_train$target
plot(c(0,1),c(0,1),type="l",main="FPR vs. TPR",xlab="FPR",ylab="TPR") # plot frame + diagonal
FPR=rep(0,ngrids) # false positive rates, reused per model
TPR=rep(0,ngrids) # true positive rates, reused per model
for(k in 1:3){
  prob=p[,k] # predicted probabilities of model k
  for(i in seq_len(ngrids)){
    p0=i/ngrids # candidate threshold
    ST.hat=1*(prob>p0) # predicted labels at threshold p0
    FPR[i]=sum((1-ST.true)*ST.hat)/sum(1-ST.true) # FPR = FP / N
    TPR[i]=sum(ST.true*ST.hat)/sum(ST.true) # TPR = TP / P
  }
  points(FPR,TPR,type="b",col=k,lty=k,pch=k) # add model k's ROC curve
}
legend(0.6,0.3,c("LOGIT FULL MODEL","LOGIT AIC MODEL", "LOGIT BIC MODEL"),lty=c(1:3),col=c(1:3),pch=c(1:3)) # label the 3 curves